import pandas as pd
import os
import re

# Show every column when a DataFrame is printed (no column-count limit).
pd.options.display.max_columns = None

# Location of the TED CSV: <parent of the current working directory>/data/ted_main.csv.
# NOTE: resolved from the working directory at import time, not from this file's location.
path = os.sep.join([os.path.abspath('..'), 'data', 'ted_main.csv'])


def read_ted_data(csv_path=None):
    """Load the TED talks CSV and normalise its columns.

    Args:
        csv_path: Optional location of the CSV file. When omitted (``None``),
            the module-level ``path`` is used.

    Returns:
        A ``pandas.DataFrame`` with the columns of the CSV, where:

        * ``comments``, ``duration``, ``languages`` and ``num_speaker`` are
          cast to ``int32``;
        * ``description``, ``event``, ``main_speaker``, ``name``,
          ``speaker_occupation`` and ``title`` are lower-cased and stripped
          of surrounding whitespace;
        * ``film_date`` and ``published_date`` are converted from epoch
          seconds to datetimes;
        * a new ``tags_split`` column holds, per row, the list of tags parsed
          from the ``tags`` string.
    """
    frame = pd.read_csv(path if csv_path is None else csv_path)

    for column in ('comments', 'duration', 'languages', 'num_speaker'):
        frame[column] = frame[column].astype('int32')

    # No intermediate 'category' conversion for 'event'/'speaker_occupation':
    # the .str accessor returns a plain (non-categorical) series anyway, so
    # the cast was discarded immediately.
    text_columns = (
        'description',
        'event',
        'main_speaker',
        'name',
        'speaker_occupation',
        'title',
    )
    for column in text_columns:
        frame[column] = frame[column].str.lower().str.strip()

    for column in ('film_date', 'published_date'):
        frame[column] = pd.to_datetime(frame[column], unit='s')

    # Each 'tags' cell is the text form of a list, e.g. "['a', 'b c']".
    # Drop the enclosing brackets, delete every single quote and every space
    # (so spaces inside a multi-word tag are removed as well), then split on
    # commas.
    drop_quotes_and_spaces = str.maketrans('', '', "' ")
    frame['tags_split'] = [
        raw.lstrip('[').rstrip(']').translate(drop_quotes_and_spaces).split(',')
        for raw in frame['tags']
    ]

    return frame
